import gradio as gr
import datetime
import base64
import numpy as np
import dashscope
import os
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is present.
load_dotenv()

# DashScope API key; fail fast at import time if it is not configured.
API_KEY = os.environ.get('API_KEY')
if not API_KEY:
    raise ValueError("请设置 API_KEY 环境变量或在 .env 文件中配置")

# Bilingual dropdown label -> `voice` parameter value for the TTS API.
# Keys are what the user sees; values are the API voice identifiers.
VOICE_OPTIONS = {
    "Cherry / 芊悦": "Cherry",
    "Ethan / 晨煦": "Ethan",
    "Jennifer / 詹妮弗": "Jennifer",
    "Ryan / 甜茶": "Ryan",
    "Katerina / 卡捷琳娜": "Katerina",
    "Nofish / 不吃鱼": "Nofish",
    "Elias / 墨讲师": "Elias",
    "Li / 南京-老李": "Li", 
    "Marcus / 陕西-秦川": "Marcus", 
    "Roy / 闽南-阿杰": "Roy", 
    "Peter / 天津-李彼得": "Peter", 
    "Eric / 四川-程川": "Eric", 
    "Rocky / 粤语-阿强": "Rocky",
    "Kiki / 粤语-阿清": "Kiki",
    "Sunny / 四川-晴儿": "Sunny",
    "Jada / 上海-阿珍": "Jada",
    "Dylan / 北京-晓东": "Dylan",
}
# Pre-selected dropdown entry; must be a key of VOICE_OPTIONS.
DEFAULT_VOICE = 'Cherry / 芊悦'

# Bilingual dropdown label -> `language_type` parameter value for the TTS API.
LANGUAGE_MAP = {
    "Auto / 自动": "Auto",
    "English / 英文": "English",
    "Chinese / 中文": "Chinese",
    "German / 德语": "German",
    "Italian / 意大利语": "Italian",
    "Portuguese / 葡萄牙语": "Portuguese",
    "Spanish / 西班牙语": "Spanish",
    "Japanese / 日语": "Japanese",
    "Korean / 韩语": "Korean",
    "French / 法语": "French",
    "Russian / 俄语": "Russian"
}

# Dropdown choices are exactly the display labels above. Deriving the list
# from LANGUAGE_MAP (dicts preserve insertion order since Python 3.7) keeps
# the two from ever drifting out of sync.
LANGUAGE_OPTIONS = list(LANGUAGE_MAP)

def tts_interface(text, voice_display, language_display):
    """Synthesize speech for *text* via the streaming Qwen3-TTS API.

    Args:
        text: Text to synthesize.
        voice_display: Bilingual display label; must be a key of VOICE_OPTIONS.
        language_display: Bilingual display label; must be a key of LANGUAGE_MAP.

    Returns:
        A ``(sample_rate, np.ndarray)`` tuple for ``gr.Audio``, or ``None``
        when the stream produced no audio (``None`` clears the component;
        a ``(rate, None)`` tuple would not be renderable).
    """
    voice_name = VOICE_OPTIONS[voice_display]
    # Map the bilingual display label to the API's language_type parameter.
    language = LANGUAGE_MAP[language_display]

    print(f"text: {text}, {voice_name}, {language} time: {datetime.datetime.now()}\n")

    audio_frames = []

    responses = dashscope.MultiModalConversation.call(
        api_key=API_KEY,
        model="qwen3-tts-flash",
        text=text,
        voice=voice_name,
        stream=True,
        language_type=language
    )

    for chunk in responses:
        try:
            audio_string = chunk.output.audio.data
        except AttributeError:
            # Chunk carries no audio payload (e.g. a status/error frame):
            # log it for debugging and skip, instead of decoding "".
            print(chunk)
            continue
        if not audio_string:
            continue
        wav_bytes = base64.b64decode(audio_string)
        # 16-bit little-endian PCM -> float32 in [-1.0, 1.0) for Gradio.
        audio_np = np.frombuffer(wav_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        audio_frames.append(audio_np)

    if not audio_frames:
        return None

    # qwen3-tts-flash streams 24 kHz audio — TODO confirm against API docs.
    sample_rate = 24000
    return (sample_rate, np.concatenate(audio_frames))

# Build the Gradio UI: inputs (text, voice, language) on the left,
# generated audio on the right, with clickable example rows below.
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]), css=".gradio-container {max-width: none !important;}") as demo:
    gr.Markdown("# 🎤 Qwen3-TTS Demo")
    
    with gr.Row():
        with gr.Column():
            # Text to synthesize — bilingual label, English first
            text_input = gr.Textbox(
                label="Input Text / 输入文本",
                placeholder="Enter text to synthesis here... / 在此输入要合成为语音的文本...",
                lines=4,
                max_lines=8
            )
            
            # Voice selection — choices are the display keys of VOICE_OPTIONS
            voice_select = gr.Dropdown(
                label="Select Voice / 选择发音人",
                choices=list(VOICE_OPTIONS.keys()),
                value=DEFAULT_VOICE
            )
            
            # Language selection — choices are the display keys of LANGUAGE_MAP
            language_select = gr.Dropdown(
                label="Select Text Language / 选择文本语言",
                choices=LANGUAGE_OPTIONS,
                value="Auto / 自动"
            )
            
            # Trigger button for speech generation
            generate_btn = gr.Button("Generate Speech / 生成语音", variant="primary")
        
        with gr.Column():
            # Output component; tts_interface returns a (sample_rate, ndarray) tuple
            audio_output = gr.Audio(label="Generated Speech / 生成的语音", interactive=False)
    
    # Example rows: clicking one fills all three inputs at once
    examples = gr.Examples(
        examples=[
            ["你好，我是通义千问，很高兴认识你。", "Cherry / 芊悦", "Chinese / 中文"],
            ["你好，我是通义千问，很高兴认识你。", "Dylan / 北京-晓东", "Chinese / 中文"],
            ["Hello, this is a text-to-speech demo", "Jennifer / 詹妮弗", "English / 英文"],
            ["こんにちは、これはデモです", "Cherry / 芊悦", "Japanese / 日语"],
        ],
        inputs=[text_input, voice_select, language_select],
        label="Examples / 示例文本"
    )

    # Wire the button to the synthesis function
    generate_btn.click(
        fn=tts_interface,
        inputs=[text_input, voice_select, language_select],
        outputs=audio_output
    )

if __name__ == "__main__":
    # share=True requests a public Gradio share link in addition to the
    # local server — NOTE(review): disable for private-only deployments.
    demo.launch(share=True)